/* File: make_main_data.do
 * Author: Luca Maini, Josh Feng
 * Purpose: Create the main dataset for the analysis
 *
 * Date Created: 8/11/2020
 *
 */

/*	The goal of this script is to create a dataset of drug sales that also 
	includes counters for the number of times that each drug was involved in a 
	deal with different characteristics or in varying capacity (e.g. how many 
	times a drug was involved in a company acquisition or in a deal that 
	involved an overlapping product being acquired).
	
	To do this, we start from the sales dataset and merge in the acquisition
	variables we created in the previous script.

*/



/////////////////////////////////////////////////////////////////////////
/////																/////
/////	STEP 1. Format sales dataset for deal characteristics merge	/////
/////																/////
/////////////////////////////////////////////////////////////////////////

* start from the list of acquiror products
use "${outdir}\deal_acquiror_list.dta", clear

* need a dataset uniquely identified by Product and quarter to merge on to the 
* sales dataset
drop if Product == ""			// these won't match to sales (and don't need 
								// to)
rename dealID dealID_acqr	// to distinguish from dealID variables already 
								// in sales data

bysort Product acquisitionQrt : gen seq = _n
reshape wide dealID_acqr, i(Product acquisitionQrt) j(seq)

* merge on to sales dataset
rename acquisitionQrt quarter	// for merge purposes

* a handful of drugs in the master do not match. These are drugs that show up in 
* the sales data in the quarter BEFORE the acquisition, but not in the quarter
* after that. With only 2 exceptions these are products that are either no 
* longer marketed, or that experience LOE in that quarter. The two exceptions 
* are products seemingly still marketed, but for which we don't have data.
merge 1:1 Product quarter ///
	using "${outdir}\ProductLevelSalesSSRwithAcquisition.dta", ///
		keep(using match) nogen

* since we only have data for two quarters in 2020, we remove those datapoints
drop if quarter >= tq(2020q1)

* now build a Product-quarter dataset with indicators for deals in the quarter 
* they happened

gen dealID_acqd = 0
gen acqd = 0			
label var acqd "Indicator for being acquired"

* this loop updates the variable but only for acquired drugs
forvalues i = 1/4 {
	replace acqd = 1 if quarter == acquisitionQrt`i'
	replace dealID_acqd = dealID`i' if quarter == acquisitionQrt`i'
	
	// drop dealID`i' acquisitionQrt`i' dealCategory`i'	// no longer needed
	drop dealCategory`i'	// no longer needed
	}

* Now do the same for acquirer (different structure because the variables that
* contain the information are different)
gen acqr = 0
label var acqr "Indicator for owned by acquirer"

forvalues i = 1/3 {
	replace dealID_acqr`i' = 0 if dealID_acqr`i' == .
	replace acqr = acqr + 1 if dealID_acqr`i' != 0	// notice that we sum for each deal 
													// because there can be more than one
													// acquisition in any given period
	}

sort Product quarter
drop imputation_flag AcquirorName





////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////





/////////////////////////////////////////////////////////////
/////													/////
/////	STEP 2. Merge in drug and deal characteristics 	/////
/////													/////
/////////////////////////////////////////////////////////////

/*	Now we want to merge deal and drug characteristics in. The datasets with the
	information we want are:
	
		1.	acquisition_characteristics.dta -> contains deal-level indicators
		2.	drug_overlap_measures.dta -> contains drug-level indicators for 
			overlap
		3.	drug_blockbuster_measures -> contains drug-level indicators for 
			blockbuster deals
		4.	portfolio_metrics -> company-level indicators for portfolio gains
			and losses

*/




***
*** 2.1 start with acquisition_characteristics.dta
***
local dealCharList "company_acq large_deal lg_acq sm_acq valeant medVH ea_ind stlth"

** We merge four times, one for each dealID that is potentially associated to a 
** product and quarter

*** Acquired
* prep variable for merging
gen dealID = dealID_acqd

* 11 deals do not match. That's because that company's product was not recording
* sales at the time of acquisition.
merge m:1 dealID using "${maindir}\acquisition_characteristics.dta", ///
	nogen keep(master match) keepusing("`dealCharList'")

foreach var of local dealCharList {
	replace `var' = 0 if `var' == .
	rename `var' `var'_acqd
	}

drop dealID

*** Acquirer (follows similar steps, but loops over three variables)
forvalues d = 1/3 {

	gen dealID = dealID_acqr`d'
	
	merge m:1 dealID using "${maindir}\acquisition_characteristics.dta", ///
		nogen keep(master match)
	
	foreach var of local dealCharList {
		replace `var' = 0 if `var' == .
		rename `var' `var'_acqr`d'
		}
	
	drop dealID
	}
	
* Condense the acquirer variables (do this by simply summing them up, so that if
* there are multiple acquisitions in a single quarter, we simply count each)
foreach var of local dealCharList {
	gen `var'_acqr = `var'_acqr1 + `var'_acqr2 + `var'_acqr3
	drop `var'_acqr1 `var'_acqr2 `var'_acqr3
	}

* update labels
foreach var of local dealCharList {
	local lbl : variable label `var'_acqd
    label var `var'_acqd "`lbl' - Acquired"
	label var `var'_acqr "`lbl' - Owned"
	}


	
***
*** 2.2	Next, merge in the drug-deal variables
***

*** Blockbuster variables

gen acquisitionQrt = quarter	// for merge purposes

* acquired products
gen dealID = dealID_acqd

merge m:1 dealID Product acquisitionQrt ///
	using "${outdir}\drug_blockbuster_measures.dta", keep(master match) ///
		keepusing(acquired_by_blockbuster) nogen

replace acquired_by_blockbuster = 0 if acquired_by_blockbuster == .
drop dealID

* acquirer products
forvalues d = 1/3 {
	gen dealID = dealID_acqr`d'
	
	merge m:1 dealID Product acquisitionQrt ///
		using "${outdir}\drug_blockbuster_measures.dta", keep(master match) ///
			keepusing(blockbuster_acquisition) nogen
	
	replace blockbuster_acquisition = 0 if blockbuster_acquisition == .
	rename blockbuster_acquisition blockbuster_acquisition`d'
	
	drop dealID
	}

* Now, again, condense the acquirer variables
gen blockbuster_acquisition = blockbuster_acquisition1 + ///
	blockbuster_acquisition2 + blockbuster_acquisition3
drop blockbuster_acquisition1 blockbuster_acquisition2 blockbuster_acquisition3

* shorten and homologate names
rename blockbuster_acquisition bb_acq_acqr
rename acquired_by_blockbuster acq_by_bb_acqd

label var bb_acq_acqr "Acquisition of blockbuster - Owned"
label var acq_by_bb_acqd "Acquired by company that owns a blockbuster - Acquired"

* create interaction variables between blockbuster and larger acquirer
gen lg_acq_acq_by_bb_acqd = lg_acq_acqd * (acq_by_bb_acqd > 0 & acq_by_bb_acqd < .)
label var lg_acq_acq_by_bb_acqd "Acquired by large company that owns a blockbuster - Acquired"

*** Overlap variables

** acquired drugs
gen dealID = dealID_acqd

merge m:1 dealID Product acquisitionQrt ///
	using "${maindir}\drug_overlap_measures.dta", ///
		keep(master match) keepusing(acqd_drug_same_atc3 ///
									 acqd_drug_same_atc4) nogen

replace acqd_drug_same_atc3 = 0 if acqd_drug_same_atc3 == .
replace acqd_drug_same_atc4 = 0 if acqd_drug_same_atc4 == .
drop dealID

* acquirer drugs
forvalues d = 1/3 {
	gen dealID = dealID_acqr`d'
	
	merge m:1 dealID Product acquisitionQrt ///
		using "${maindir}\drug_overlap_measures.dta", ///
			keep(master match) keepusing(acqr_drug_same_atc3 ///
										 acqr_drug_same_atc4) nogen
	
	replace acqr_drug_same_atc3 = 0 if acqr_drug_same_atc3 == .
	replace acqr_drug_same_atc4 = 0 if acqr_drug_same_atc4 == .
	
	rename acqr_drug_same_atc3 acqr_drug_same_atc3`d'
	rename acqr_drug_same_atc4 acqr_drug_same_atc4`d'
	
	drop dealID
	}

* Now, again, condense the acquirer variables
foreach var in acqr_drug_same_atc3 acqr_drug_same_atc4 {
	gen `var' = `var'1 + `var'2 + `var'3
	drop `var'1 `var'2 `var'3
	}

* rename overlap variables to keep naming consistency
rename acqd_drug_same_atc3 same_atc3_acqd
rename acqd_drug_same_atc4 same_atc4_acqd
rename acqr_drug_same_atc3 same_atc3_acqr
rename acqr_drug_same_atc4 same_atc4_acqr


* create interaction variables between overlap variables and a few others
foreach var in stlth lg_acq sm_acq {
	gen `var'_same_atc3_acqr = `var'_acqr * (same_atc3_acqr > 0 & same_atc3_acqr < .)
	gen `var'_same_atc3_acqd = `var'_acqd * (same_atc3_acqd > 0 & same_atc3_acqd < .)
	gen `var'_same_atc4_acqr = `var'_acqr * (same_atc4_acqr > 0 & same_atc4_acqr < .)
	gen `var'_same_atc4_acqd = `var'_acqd * (same_atc4_acqd > 0 & same_atc4_acqd < .)
	}

label var same_atc3_acqd "ATC-3 overlap deal - Acquired"
label var same_atc4_acqd "ATC-4 overlap deal - Acquired"
label var same_atc3_acqr "ATC-3 overlap deal - Owned"
label var same_atc4_acqr "ATC-4 overlap deal - Owned"
label var stlth_same_atc3_acqd "Stealth, ATC-3 overlap deal - Acquired"
label var stlth_same_atc3_acqr "Stealth, ATC-3 overlap deal - Owned"
label var stlth_same_atc4_acqd "Stealth, ATC-4 overlap deal - Acquired"
label var stlth_same_atc4_acqr "Stealth, ATC-4 overlap deal - Owned"
label var sm_acq_same_atc3_acqd "Smaller acquirer, ATC-3 overlap deal - Acquired"
label var sm_acq_same_atc3_acqr "Smaller acquirer, ATC-3 overlap deal - Owned"
label var lg_acq_same_atc3_acqd "Larger acquirer, ATC-3 overlap deal - Acquired"
label var lg_acq_same_atc3_acqr "Larger acquirer, ATC-3 overlap deal - Owned"


	
*** Now, repeat the steps to merge in also the Indication and Class variables

** Indication
gen dealID = dealID_acqd

merge m:1 dealID Product acquisitionQrt ///
	using "${outdir}\drug_overlap_indication_measures.dta", ///
		keep(master match) keepusing(acqd_drug_same_indication) nogen

replace acqd_drug_same_indication = 0 if acqd_drug_same_indication == .
drop dealID

* acquirer drugs
forvalues d = 1/3 {
	gen dealID = dealID_acqr`d'
	
	merge m:1 dealID Product acquisitionQrt ///
		using "${outdir}\drug_overlap_indication_measures.dta", ///
			keep(master match) keepusing(acqr_drug_same_indication) nogen
	
	replace acqr_drug_same_indication = 0 if acqr_drug_same_indication == .
	
	rename acqr_drug_same_indication acqr_drug_same_indication`d'
	
	drop dealID
	}

* Now, again, condense the acquirer variables
foreach var in acqr_drug_same_indication {
	gen `var' = `var'1 + `var'2 + `var'3
	drop `var'1 `var'2 `var'3
	}

* rename overlap variables to keep naming consistency
rename acqd_drug_same_indication same_ind_acqd
rename acqr_drug_same_indication same_ind_acqr


label var same_ind_acqd "Indication overlap deal - Acquired"
label var same_ind_acqr "Indication overlap deal - Owned"

** Class
gen dealID = dealID_acqd

merge m:1 dealID Product acquisitionQrt ///
	using "${outdir}\drug_overlap_class_measures.dta", ///
		keep(master match) keepusing(acqd_drug_same_class) nogen

replace acqd_drug_same_class = 0 if acqd_drug_same_class == .
drop dealID

* acquirer drugs
forvalues d = 1/3 {
	gen dealID = dealID_acqr`d'
	
	merge m:1 dealID Product acquisitionQrt ///
		using "${outdir}\drug_overlap_class_measures.dta", ///
			keep(master match) keepusing(acqr_drug_same_class) nogen
	
	replace acqr_drug_same_class = 0 if acqr_drug_same_class == .
	
	rename acqr_drug_same_class acqr_drug_same_class`d'
	
	drop dealID
	}

* Now, again, condense the acquirer variables
foreach var in acqr_drug_same_class {
	gen `var' = `var'1 + `var'2 + `var'3
	drop `var'1 `var'2 `var'3
	}

* rename overlap variables to keep naming consistency
rename acqd_drug_same_class same_cl_acqd
rename acqr_drug_same_class same_cl_acqr


label var same_cl_acqd "Class overlap deal - Acquired"
label var same_cl_acqr "Class overlap deal - Owned"


* drop dealID variables -- no longer needed
drop dealID_acqd dealID_acqr*


***
***	2.3 Portfolio metrics
***

* the only unmatched observations are in 2020
merge 1:1 Product quarter using "${outdir}/portfolio_metrics.dta", ///
	keep(master match) nogen

* missings are zeros
replace portfolio_gain = 0 if portfolio_gain == .
replace portfolio_loss = 0 if portfolio_loss == .
	
* name edits for code consistency
rename portfolio_gain portfolio_gain_acqr
rename portfolio_loss portfolio_loss_acqd

label var portfolio_gain_acqr "Acquisition leading to large portfolio increase - Acquirer"
label var portfolio_loss_acqd "Acquisition leading to large portfolio decrease - Owned by target"

* Interact the large portfolio gain variable with acquisition by a larger 
* acquirer
foreach var in lg_acq {
	gen `var'_port_gain_acqr = `var'_acqr * (portfolio_gain_acqr > 0 & portfolio_gain_acqr < .)
	}

label var lg_acq_port_gain_acqr "Larger acquirer with acquisition leading to large portfolio increase - Acquirer"



*** 
*** 2.4 Finally, merge in the acq_in_class indicators
***

** acquired drugs

merge m:1 Product acquisitionQrt ///
	using "${outdir}\drug_acq_in_class_1.dta", ///
		keep(master match) keepusing(acq_in_atc4) nogen

merge m:1 Product acquisitionQrt ///
	using "${outdir}\drug_acq_in_class_2.dta", ///
		keep(master match) keepusing(ov_acq_in_atc3) nogen

merge m:1 Product acquisitionQrt ///
	using "${outdir}\drug_acq_in_class_3.dta", ///
		keep(master match) keepusing(ov_acq_atc4) nogen

merge m:1 Product acquisitionQrt ///
	using "${outdir}\drug_acq_in_class_4.dta", ///
		keep(master match) keepusing(stlth_ov_acq_in_atc3) nogen

merge m:1 Product acquisitionQrt ///
	using "${outdir}\drug_acq_in_class_5.dta", ///
		keep(master match) keepusing(stlth_ov_acq_atc4) nogen

foreach var in acq_in_atc4 ov_acq_in_atc3 ov_acq_atc4 stlth_ov_acq_in_atc3 stlth_ov_acq_atc4 {
	replace `var' = 0 if `var' == .
	}

label var acq_in_atc4 "Acquisition in ATC-4 - Drug not involved"
label var ov_acq_in_atc3 "Overlap acquisition in ATC-3 - Drug not involved"
label var ov_acq_atc4 "Overlap acquisition in ATC-4 - Drug not involved"
label var stlth_ov_acq_in_atc3 "Stealth horizontal acquisition in ATC-3 - Drug not involved"
label var stlth_ov_acq_atc4 "Stealth horizontal acquisition in ATC-4 - Drug not involved"

drop acquisitionQrt


* Now create interaction variables between Valeant and some indicators
foreach var in same_atc3 {
	gen v_`var'_acqr = `var'_acqr * (valeant_acqr >= 1)
	gen v_`var'_acqd = `var'_acqd * valeant_acqd	// the ACQUIRED variables are binary	
	}

label var v_same_atc3_acqd "Valeant, same ATC-3 overlap deal - Acquired"
label var v_same_atc3_acqr "Valeant, same ATC-3 overlap deal - Owned"


////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////





/////////////////////////////////////////////////////////
/////												/////
/////	STEP 3. Collapse to the Product-year level	/////
/////												/////
/////////////////////////////////////////////////////////

foreach var of varlist *acqd* *acqr* {
	bysort Product (quarter) : gen `var'_count = sum(`var')
	local lbl : variable label `var'
    label var `var'_count "`lbl' - COUNT"

	}


/*	Now collapse to the yearly level. Rules:
	
		1.	For counter variables we take the last value in a given year (which 
			is the same as the max value since counters only ever increase in 
			value) -- THIS IS THE SAME AS IN THE PREVIOUS CODE BY JOSH
		2.	For marketing company we take the last company (i.e. the company 
			that was marketing the product on the last day of the year)

*/

gen year = yofd(dofq(quarter))

sort Product quarter


* save labels
 foreach var of varlist _all {
	local l`var' : variable label `var'
	}

* ensure missing sales don't get collapsed to 0
gen any_valid_net = (net_sales != .)
gen any_valid_WAC = (WAC_sales != .)	// valid for units too

* Create units variable for the net price data (fixes issue where we have units
* data for the full year, but only net_sales data for part of the year)
gen units_netComp = units if net_sales != .
gen WAC_sales_netComp = WAC_sales if net_sales != .


collapse (sum) WAC_sales net_sales units units_netComp WAC_sales_netComp any_valid* ///
		 (last) *count mktCompany ///
		 (first) mkt_start mkt_end LOE dealID* acquisitionQrt* ///
		 (max) valeant_acqd valeant_acqr acqd acqr, by(Product year)


/*	Update for Valeant: Since Valeant performs 20+ acquisitions in 5 years, we 
	need to make some adjustment to make sense of the coefficients. First, we 
	use a single step in each year in which valeant has AT LEAST one 
	acquisition. Then, if a drug is acquired in a year, and then Valeant makes 
	another acquisition, we do NOT count being owned by an acquiring company for 
	that drug and year.
*/

foreach var in valeant_acqr_count v_same_atc3_acqr_count {
	bysort Product (year) : gen temp = ///
		(`var' != `var'[_n-1] & `var'[_n-1] != .) &		/// checks that one acquisition took place
		valeant_acqd == 0	// checks that the product was not acquired in that period
	bysort Product (year) : gen `var'x = sum(temp)
	drop temp
	}

* re-calculate dependent variables
replace net_sales = . if any_valid_net == 0
replace units_netComp = . if any_valid_net == 0
replace WAC_sales_netComp = . if any_valid_net == 0
replace WAC_sales = . if any_valid_WAC == 0
replace units = . if any_valid_WAC == 0

drop any_valid*

* re-attach labels
foreach var of varlist _all {
	label var `var' "`l`var''"
	} 

* drop years on or after LOE
bysort Product (year) : egen temp = max(LOE)
drop LOE
rename temp LOE
format LOE %td

drop if year >= year(LOE)

* drop years before marketing start
bysort Product (year) : egen temp = max(mkt_start)
drop mkt_start
rename temp mkt_start
format mkt_start %td

drop if year < yofd(mkt_start)

* re-create price variables
gen WAC_price = WAC_sales/units
gen net_price = net_sales/units_netComp

* Drop wrong WAC datapoint for VIMOVO (corrected in later versions of the SSR 
* Health data)
replace WAC_price = . if year == 2010 & Product == "VIMOVO"


* create variables for the analysis
gen log_WAC = log(WAC_price)
gen log_net = log(net_price)

* Winsorized net price variable
gen discount = 1 - net_sales / WAC_sales_netComp

summ discount, detail
local lb_disc = r(p5)

gen w_net_price = net_price
replace w_net_price = WAC_price * (1-`lb_disc') if discount < `lb_disc'

gen w_log_net = log(w_net_price)

drop units_netComp WAC_sales_netComp	// no longer needed

gen log_units = log(units)
gen log_sales = log(net_sales)

* get additional variables
gen age = year - year(mkt_start)
bysort Product : egen min_age = min(age)
replace age = age - min_age if min_age < 0	// affects a small number of obs
drop min_age

* merge with formulary data (notice that we will have the acquisition variables 
* coded only for observations that match the sales data. This is why we keep 
* master match). We do not drop master observations that do not match because
* that would mean getting rid of (many) drugs whose formulary information we
* don't have.
merge 1:1 Product year ///
	using "${outdir}/formulary_data_cln_year-drug.dta", ///
		nogen keep(master match)

label var frcov "Fraction of lives with access"
label var frunr "Fraction of lives with unrestricted access" 
label var frpre "Fraction of lives with preferred access"
label var glp "Weighted average for access metric"

* save the data in two formats: with, and without valeant drugs
bysort Product : egen valeant_ind = max(regexm(mktCompany, "VALEANT PHARMACEUTICALS"))
// gen valeant_ind = regexm(mktCompany, "VALEANT PHARMACEUTICALS")

save "${maindir}\combined_regression_dataset_with_valeant.dta", replace

* drop Valeant data
drop if valeant_ind == 1	// this is the more conservative approach
drop valeant_ind

* save the data
save "${maindir}\combined_regression_dataset.dta", replace
